Part I: Visualization
- Heat map of the two candidates and a moderator.
- Word cloud built from the text of the two debates for each candidate.
- Sentiment analysis (polarity and subjectivity) of the two candidates.
Part II: Prediction
- Naive Bayes
- Logistic Regression
- Support Vector Classification
import numpy as np
import pandas as pd
# display full dataframes (no row/column truncation) in notebook output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# For analyzing text
import en_core_web_sm
nlp = en_core_web_sm.load()  # spaCy English model; NOTE(review): `nlp` appears unused in this file — confirm
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
from textblob import TextBlob
# For vis
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns
import datetime
import warnings
# silence pandas chained-assignment and other library warnings
warnings.filterwarnings('ignore')
# load the transcripts of the 1st and 2nd 2020 presidential debates
first = pd.read_csv('us_election_2020_1st_presidential_debate.csv')
second = pd.read_csv('us_election_2020_2nd_presidential_debate.csv')
first.head()
second.head()
# summarize the null in the data
null_df = pd.DataFrame(pd.concat([first.isnull().sum(), second.isnull().sum()], axis = 1))
null_df.columns = ['first', 'second']
null_df
# inspect the rows around the missing 'minute' timestamp (row 179)
first.iloc[178:181,:]
# fill the missing timestamp with a placeholder start time of 00:00
first.loc[first.minute.isnull(), 'minute'] = '00:00'
first.iloc[178:181,:]
# changing their names for more simplicity and coherence in two datasets
print('names in the first dataset:', (first.speaker.unique()))
print('names in the second dataset:', (second.speaker.unique()))
first.loc[first.speaker.str.contains('Chris Wallace:'), 'speaker'] = 'Chris Wallace' # correcting the typo in the name
first.loc[first.speaker.str.contains('Vice President Joe Biden'), 'speaker'] = 'Joe Biden'
# NOTE(review): str.contains treats the pattern as a regex, so the '.' in
# 'President Donald J. Trump' matches any character — harmless here, but verify.
first.loc[first.speaker.str.contains('President Donald J. Trump'), 'speaker'] = 'Donald Trump'
# both moderators get a role-based alias so the two frames use parallel labels
first.loc[first.speaker.str.contains('Chris Wallace'), 'speaker'] = 'mediator_1'
second.loc[second.speaker.str.contains('Kristen Welker'), 'speaker'] = 'mediator_2'
print('Modified names in the first dataset:', (first.speaker.unique()))
print('Modified names in the second dataset:', (second.speaker.unique()))
# making the time consecutive
# First Debate
first['seconds'] = 80  # initial value of 80 because the first sentence is at 1:20
# enumerate(first.minute, 1): start the counter at 1
# (https://book.pythontips.com/en/latest/enumerate.html)
for i, tm in enumerate(first.minute, 1):
    timeParts = [int(s) for s in str(tm).split(':')]
    # FIX: use .loc instead of chained indexing (first['seconds'][i-1] = ...),
    # which raises SettingWithCopy issues and silently stops writing under
    # pandas copy-on-write.
    if i <= 179:
        # segment 1: raw mm:ss timestamps from the start of the debate
        first.loc[i - 1, 'seconds'] = timeParts[0] * 60 + timeParts[1]
    if i > 179 and i <= 724:
        # segment 2: the clock restarted, so offset by the last time of segment 1
        first.loc[i - 1, 'seconds'] = timeParts[0] * 60 + timeParts[1] + first.loc[178, 'seconds']
    if i > 724:
        # segment 3: hh:mm:ss format; offset by the end of segment 1
        # NOTE(review): offsetting by row 178 (not row 723) mirrors the original
        # logic — confirm this is the intended cumulative offset.
        first.loc[i - 1, 'seconds'] = (timeParts[0] * 60 + timeParts[1]) * 60 + timeParts[2] + first.loc[178, 'seconds']
# Second Debate
second['seconds'] = 18  # initial value of 18 because the first sentence is at 0:18
for i, tm in enumerate(second.minute, 1):
    timeParts = [int(s) for s in str(tm).split(':')]
    # FIX: .loc instead of chained indexing (second['seconds'][i-1] = ...),
    # which is unreliable under pandas copy-on-write.
    if i <= 89:
        # segment 1: raw mm:ss timestamps
        second.loc[i - 1, 'seconds'] = timeParts[0] * 60 + timeParts[1]
    if i <= 337 and i > 89:
        # segment 2: clock restarted, offset by the last time of segment 1
        second.loc[i - 1, 'seconds'] = timeParts[0] * 60 + timeParts[1] + second.loc[88, 'seconds']
    if i > 337:
        # segment 3: offset by the (already-offset) last time of segment 2
        second.loc[i - 1, 'seconds'] = timeParts[0] * 60 + timeParts[1] + second.loc[336, 'seconds']
# derive whole minutes, and an h:mm:ss string, from the cumulative seconds
first['minutes'] = first['seconds'] // 60
second['minutes'] = second['seconds'] // 60
# datetime.timedelta renders a seconds count in the %h:%m:%s format
first['time'] = first['seconds'].apply(lambda sec: str(datetime.timedelta(seconds=sec)))
second['time'] = second['seconds'].apply(lambda sec: str(datetime.timedelta(seconds=sec)))
# 'seconds', 'minutes' and 'time' all mark when each speaker begins to talk
first.head()
second.head()
# count utterances per (minute, speaker); after groupby().count() every
# remaining column holds the count, and 'minute' is used as that count below
heat = first.groupby(['minutes', 'speaker']).count().reset_index()
fig = go.Figure(data=go.Heatmap(
    z=heat.minute,   # number of utterances in that minute
    x=heat.minutes,  # minute of the debate
    y=heat.speaker,
    colorscale='sunset', #https://plotly.com/python/builtin-colorscales/
    colorbar=dict(
        title="Heat of the discussion",
        titleside="top",
        tickmode="array",
        tickvals=[1, 4, 10],
        ticktext=["Cool", "Normal", "Hot"],
        ticks="outside"
    )
))
fig.update_layout(title='First Debate: # of times each one talks in each minute',
                  xaxis_nticks=36)
fig.show()
# Create and show figure
# same heatmap as above, for the second debate
heat2 = second.groupby(['minutes', 'speaker']).count().reset_index()
fig2 = go.Figure(data=go.Heatmap(
    z=heat2.minute,   # number of utterances in that minute
    x=heat2.minutes,
    y=heat2.speaker,
    colorscale='sunset',
    colorbar=dict(
        title="Heat of the discussion",
        titleside="top",
        tickmode="array",
        tickvals=[2, 5, 8],
        ticktext=["Cool", "Normal", "Hot"],
        ticks="outside"
    )
))
fig2.update_layout(title='Second Debate: # of times each one talks in each minute',
                   xaxis_nticks=36)
fig2.show()
# Create and show figure
# subset each candidate's utterances in each debate
Biden1 = first[first.speaker=='Joe Biden']
Biden2 = second[second.speaker=='Joe Biden']
Trump1 = first[first.speaker=='Donald Trump']
# BUG FIX: the original filtered `second` with a boolean mask built from
# `first` (second[first.speaker=='Donald Trump']), which selects misaligned
# rows; the mask must come from the frame being indexed.
Trump2 = second[second.speaker=='Donald Trump']
# combine 2 debates
Biden_text = pd.concat([Biden1.text, Biden2.text], axis = 0)
Trump_text = pd.concat([Trump1.text, Trump2.text], axis = 0)
# change list to str: join the individual utterances into one long string
Biden_text = " ".join(txt for txt in Biden_text)
Trump_text = " ".join(txt for txt in Trump_text)
def textclean(text):
    """Tokenize, lowercase, de-punctuate, de-stopword, and lemmatize *text*.

    Returns a list of cleaned word tokens.
    """
    # tokenize and lowercase in one pass
    tokens = [tok.lower() for tok in word_tokenize(text)]
    # strip punctuation characters from every token
    punct_table = str.maketrans('', '', string.punctuation)
    stripped = (tok.translate(punct_table) for tok in tokens)
    # keep only purely alphabetic tokens that are not English stopwords
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in stripped if tok.isalpha() and tok not in stop_words]
    # lemmatize as verbs first, then as nouns (same two-pass order as before)
    # ref: https://blog.csdn.net/weixin_33963594/article/details/88726982
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(lemmatizer.lemmatize(tok, 'v'), 'n') for tok in kept]
# Create and generate a word cloud image:
# ' '.join(Words_lower): change to str — WordCloud.generate expects one string
wordcloud = WordCloud(width = 600, height = 400, background_color="white").generate(' '.join(textclean(Biden_text)))
# Display the generated image (Biden):
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# Create and generate a word cloud image:
# ' '.join(Words_lower): change to str
wordcloud = WordCloud(width = 600, height = 400, background_color="white").generate(' '.join(textclean(Trump_text)))
# Display the generated image (Trump):
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# remove some meaningless words from the plotted word cloud
customized_remove_string = ['see', 'go', 'want', 'know', 'think', 'way', 'make', 'take', 'thing', 'say', 'let']
textclean_Trump_text =[w for w in textclean(Trump_text) if not w in customized_remove_string]
textclean_Biden_text =[w for w in textclean(Biden_text) if not w in customized_remove_string]
# replot the word cloud using masks of two candidates' picture
# (the PNG shapes the cloud and supplies the coloring)
mask = np.array(Image.open("Trump.png"))
wordcloud = WordCloud(background_color="white", mode="RGBA", max_words=1000, mask=mask).generate(' '.join(textclean_Trump_text))
# create coloring from image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[7,7])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
# store to file
# plt.savefig("Trump_wordcloud.png", format="png", dpi=200)
plt.show()
# same masked cloud for Biden
mask = np.array(Image.open("Biden.png"))
wordcloud = WordCloud(background_color="white", mode="RGBA", max_words=1000, mask=mask).generate(' '.join(textclean_Biden_text))
# create coloring from image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[7,7])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
# store to file
# plt.savefig("Biden_wordcloud.png", format="png", dpi=200)
plt.show()
Compute the polarity & subjectivity of each sentence.
polarity: negative vs. positive (-1.0 => +1.0)
subjectivity: objective vs. subjective (+0.0 => +1.0)
Group the polarity values into five buckets:
negative [-1.0, -0.6]
somewhat negative (-0.6, -0.2]
neutral (-0.2, 0.2]
somewhat positive (0.2, 0.6]
positive (0.6, 1.0]
# NLTK's pre-trained Punkt sentence splitter for English
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# number of sentences used by each person, each time they're allowed to talk
first['number_of_sents'] = first.text.apply(lambda x:len(sent_detector.tokenize(x)))
second['number_of_sents'] = second.text.apply(lambda x:len(sent_detector.tokenize(x)))
# number of sentences in each cell
lens_1 = first.number_of_sents
lens_2 = second.number_of_sents
# making a long list of all sentences
list_1 =[]
list_2 = []
for x in first.text.apply(lambda x:sent_detector.tokenize(x)):
    list_1.extend(x)
for x in second.text.apply(lambda x:sent_detector.tokenize(x)):
    list_2.extend(x)
# create new dataframes, one row per sentence: np.repeat duplicates the
# speaker/time of each utterance once per sentence it contains
first_sent = pd.DataFrame({'speaker': np.repeat(first.speaker, lens_1),
                           'time': np.repeat(first.time, lens_1),
                           'sent': list_1})
second_sent = pd.DataFrame({'speaker': np.repeat(second.speaker, lens_2),
                            'time': np.repeat(second.time, lens_2),
                            'sent': list_2})
first_sent.head()
# first df
# compute polarity & subjectivity per sentence with TextBlob
first_sent['polarity'] = first_sent.sent.apply(lambda x: TextBlob(x).polarity)
first_sent['subjectivity'] = first_sent.sent.apply(lambda x: TextBlob(x).subjectivity)
# group the polarity into the five buckets described above
first_sent['sentiment'] = first_sent.polarity.apply(lambda x: 'positive' if x>0.6 else 'somewhat positive' if x>0.2 else 'neutral' if x>-0.2 else 'somewhat negative' if x>-0.6 else 'negative')
# second df
# compute polarity & subjectivity
second_sent['polarity'] = second_sent.sent.apply(lambda x: TextBlob(x).polarity)
second_sent['subjectivity'] = second_sent.sent.apply(lambda x: TextBlob(x).subjectivity)
# group the polarity
second_sent['sentiment'] = second_sent.polarity.apply(lambda x: 'positive' if x>0.6 else 'somewhat positive' if x>0.2 else 'neutral' if x>-0.2 else 'somewhat negative' if x>-0.6 else 'negative')
# reset to a clean 0..n-1 index after the np.repeat expansion
first_sent.reset_index(drop = True, inplace = True)
second_sent.reset_index(drop = True, inplace = True)
# plot the pie chart showing the percentage of different groups of words each candidate uses
summery_sentiment_first = first_sent.groupby(['speaker', 'sentiment']).count().reset_index()
# the 'polarity' column now holds the per-group row counts
Trump_sentiment_first = summery_sentiment_first.loc[summery_sentiment_first.speaker == "Donald Trump"].polarity
labels = 'somewhat positive/negative', 'neutral', 'positive/negative'
# Positional indices rely on groupby's default alphabetical sort: rows 0-4 are
# Trump's sentiments in the order negative, neutral, positive, somewhat
# negative, somewhat positive.
# NOTE(review): this breaks if any of the five categories is absent — verify.
sizes = [Trump_sentiment_first[3] + Trump_sentiment_first[4], Trump_sentiment_first[1], Trump_sentiment_first[0] + Trump_sentiment_first[2]]
colors = ['yellowgreen', 'lightskyblue', 'lightcoral']
explode = (0, 0, 0)
# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.title('Polarity analysis of Trump first debate')
plt.axis('equal')
plt.show()
# rows 5-9 are Biden's sentiments in the same alphabetical order
Biden_sentiment_first = summery_sentiment_first.loc[summery_sentiment_first.speaker == "Joe Biden"].polarity
labels = 'somewhat positive/negative', 'neutral', 'positive/negative'
sizes = [Biden_sentiment_first[8] + Biden_sentiment_first[9], Biden_sentiment_first[6], Biden_sentiment_first[5] + Biden_sentiment_first[7]]
colors = ['yellowgreen', 'lightskyblue', 'lightcoral']
explode = (0, 0, 0)
# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.title('Polarity analysis of Biden first debate')
plt.axis('equal')
plt.show()
# sort out the negative and positive sentences (|polarity| >= 0.6) and show
# the first 15 with a red/blue gradient on the polarity column
# FIX: removed the redundant double assignment `cmap = cmap=...`
cmap = sns.diverging_palette(5, 250, as_cmap=True)
first_sent.loc[(first_sent['polarity']>=0.6) | (first_sent['polarity']<=-0.6),['speaker', 'sent', 'polarity']]\
    .head(15).style.background_gradient(cmap, subset=['polarity'])
# plot the histogram showing the distribution of each candidate's subjectivity
# stack both debates so the histogram covers all sentences per speaker
both = pd.concat([first_sent, second_sent], axis = 0)
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=both[both.speaker == 'Donald Trump'].subjectivity,
    name='Trump', xbins=dict(start=-1, end=2, size=0.1),
    marker_color='red', opacity=0.75))
fig.add_trace(go.Histogram(
    x=both[both.speaker == 'Joe Biden'].subjectivity,
    name='Biden', xbins=dict(start=-1, end=2, size=0.1),
    marker_color='#3498DB', opacity=0.75))
fig.update_layout(
    title_text="Number of Sentences used by Debaters with different Subjectivities",
    yaxis_title_text='Number of Sentences',
    xaxis_title_text='Subjectivity',
    bargap=0.1, bargroupgap=0.1)
import sklearn
from sklearn.model_selection import train_test_split
# Encode target labels with value between 0 and n_classes-1
from sklearn.preprocessing import LabelEncoder
# AUC is in fact often preferred over accuracy for binary classification:
# https://datascience.stackexchange.com/questions/806/advantages-of-auc-vs-standard-accuracy#:~:text=AUC%20and%20accuracy%20are%20fairly%20different%20things.&text=For%20a%20given%20choice%20of,is%20already%20measuring%20something%20else.
# https://www.quora.com/Why-is-AUC-a-better-measure-of-an-algorithms-performance-than-accuracy
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report, accuracy_score
# CountVectorizer will convert a collection of text documents to a sparse matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# keep only the two candidates' sentences; the moderators are excluded
both1 = both.copy()
Trump_sent =both1.loc[both1['speaker'] == 'Donald Trump']
Biden_sent =both1.loc[both1['speaker'] == 'Joe Biden']
two_sent = pd.concat([Trump_sent, Biden_sent], axis = 0)
X = two_sent['sent']
y = two_sent['speaker']
# Trump belongs to class 0 and Biden belongs to class 1
# (LabelEncoder assigns labels alphabetically: 'Donald Trump' < 'Joe Biden')
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_encoded,
                                                    test_size=0.2,
                                                    random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# bag-of-words features: fit the vocabulary on the training split only
cv = CountVectorizer()
X_train_vectorized = cv.fit_transform(X_train)
X_train = X_train_vectorized
X_test_vectorized = cv.transform(X_test)
X_test = X_test_vectorized
# CountVectorizer has already been fitted with the training data.
# So for your test data, you just want to call transform(), not fit_transform()
# https://stackoverflow.com/questions/45804133/dimension-mismatch-error-in-countvectorizer-multinomialnb
# baseline Multinomial Naive Bayes with light smoothing
NB = MultinomialNB(alpha=0.1)
NB.fit(X_train, y_train)
#making predictions & looking at Accuracy score
predictions = NB.predict(X_test)
print('Accuracy score:',accuracy_score(y_test, predictions))
# confusion matrix
# NOTE: this import is redundant — the same names were imported above
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# class 0 (Trump) is the "negative" class, class 1 (Biden) the "positive" one
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(pd.DataFrame(confusion_matrix(y_test, predictions),
                   columns=['Predicted Trump', "Predicted Biden"], index=['Actual Trump', 'Actual Biden']))
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')
print(f'\nAccuracy: { ((tp + tn) / (tp + tn + fp + fn))}')
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
print(confusion_matrix(y_test, predictions))
print('------------------------------------------------------')
print(classification_report(y_test, predictions))
def gridSearchCV(model, params):
    """
    Run a 5-fold grid search over *params*, scored by ROC AUC.

    @param model: sklearn estimator
    @param params (dict): Dictionary of possible parameters
    @return: (per-candidate results dataframe, tuned-params string, best-score string)
    """
    # NOTE: fits on the module-level X_train / y_train
    searcher = GridSearchCV(model, param_grid=params, scoring='roc_auc', cv=5)
    searcher.fit(X_train, y_train)
    results_df = pd.DataFrame(searcher.cv_results_)[['params', 'mean_test_score']]
    tuned_msg = "Tuned Parameters: {}".format(searcher.best_params_)
    score_msg = "Best score is {}".format(searcher.best_score_)
    return results_df, tuned_msg, score_msg
def evaluate(model, plotROC=False):
    """
    Fit *model*, print AUC / optimal threshold / accuracy on the test set,
    optionally plot the ROC curve, and return the best threshold.

    FIX: the original docstring promised to return the best threshold but the
    function returned None; it now returns OptimalThreshold (backward
    compatible — existing call sites discard the return value).

    @param model: sklearn estimator supporting predict_proba
    @param plotROC (bool): draw the ROC curve when True
    @return: threshold maximizing (TPR - FPR) on the test set
    """
    # NOTE: uses the module-level X_train, y_train, X_test, y_test
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)
    preds = probs[:,1]  # probability of class 1 (Biden)
    fpr, tpr, threshold = roc_curve(y_test, preds)
    auc_score = roc_auc_score(y_test, preds)
    print(f'AUC: {auc_score:.4f}')#rounding digit
    # Find optimal threshold: the point maximizing Youden's J = TPR - FPR
    rocDf = pd.DataFrame({'fpr': fpr, 'tpr':tpr, 'threshold':threshold})
    rocDf['tpr - fpr'] = rocDf.tpr - rocDf.fpr
    OptimalThreshold = rocDf.threshold[rocDf['tpr - fpr'].idxmax()]
    print(f'OptimalThreshold: {OptimalThreshold:.4f}')
    # Get accuracy over the test set (at the model's default 0.5 threshold)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f'Accuracy: {accuracy*100:.2f}%')
    # Plot ROC AUC
    if plotROC:
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % auc_score)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
    return OptimalThreshold
# tune the Laplace-smoothing strength for Naive Bayes
params = {'alpha': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,\
                    1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.0,2.1,2.2,2.3,2.4]}
NB = MultinomialNB()
gridSearchCV(NB, params)
evaluate(MultinomialNB(alpha=0.5), plotROC=True)
# tune the inverse regularization strength for logistic regression
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
lr = LogisticRegression()
gridSearchCV(lr,params)
evaluate(LogisticRegression(C = 1), plotROC=True)
# tune RBF-kernel SVC; probability=True is required for predict_proba in evaluate()
params = {'gamma':[0.1, 1, 10, 100],
          'C':[0.1, 1, 10, 100, 1000]}
clf = svm.SVC()
gridSearchCV(clf,params)
evaluate(svm.SVC(C = 10, gamma = 0.1, probability=True), plotROC=True)
# hard-coded AUC results taken from the evaluate() runs above
label = ['Naive Bayes', 'Logistic Regression', 'Support Vector Classification']
auclist = [0.8519, 0.8289, 0.7762]
def plot_bar_x(labels=None, scores=None):
    """
    Bar plot of model AUC scores, highlighting the best model in red.

    Generalized: accepts the data as parameters instead of only reading the
    module-level globals; the zero-argument call below behaves as before.

    @param labels (list[str]): bar names; defaults to module-level `label`
    @param scores (list[float]): AUC values; defaults to module-level `auclist`
    """
    labels = label if labels is None else labels
    scores = auclist if scores is None else scores
    # x positions, one per model
    index = np.arange(len(labels))
    # grey out every bar except the maximum score
    clrs = ['grey' if (x < max(scores)) else 'red' for x in scores]
    g = sns.barplot(x=index, y=scores, palette=clrs)
    plt.xlabel('Model type', fontsize=10)
    plt.ylabel('AUC score', fontsize=10)
    plt.xticks(index, labels, fontsize=10, rotation=30)
    plt.title('AUC score for each fitted model')
    # annotate each bar with its height, offset above the bar
    for p in g.patches:
        g.annotate("%.2f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center', fontsize=11, color='gray', xytext=(0, 20),
                   textcoords='offset points')
    g.set_ylim(0, 1.25)  # To make space for the annotations
plot_bar_x()